Exact call used to run the simulation

Simulation parameters: CA, continuous outcome (AVG MONTHLY EARNINGS IN 2006 DOLLARS), train set of 1000, small covariate set, with sigma 0.2, with proportion treated set to proportion in real data. All models except SL, all queens including SL, 100 iters. Parallel.

ca_simulation <- run_simulation(
                                # Required arguments
                                real_data = ca_subset_imputed, 
                                realdata_file_path = NULL,
                                dataset_name = "ca",    
                                covariates = ca_small_set,
                                outcome = "Y18JBERNA_06",
                                treatment = ca_treatment,
                                
                                # Optional arguments
                                large_covariate_set = ca_large_set, 
                                small_covariate_set = ca_small_set, 
                                all_outcomes = ca_outcomes,
                                S = 100,  
                                queen_list = DEFAULT_QUEEN_LIST,
                                p_tx = NULL,
                                size_train = 1000,
                                size_test = 10000, 
                                PARALLEL = TRUE,
                                verbose = 1000,
                                include_LASSO = TRUE,
                                include_RF = TRUE,
                                include_SL_S = FALSE,
                                include_SL_T = FALSE,
                                include_CDML = TRUE,
                                include_XGBOOST = TRUE,
                                include_BART = TRUE,
                                master_seed = NULL,
                                baseline_seed = 68814, 
                                baseline_N = 100000, 
                                baseline_directory_path =  here::here("baseline"),
                                true_iates_directory_path = here::here("true_iates"),
                                today = Sys.Date() 
                                )

Set up

#Specify axis maximums, shape maps, etc

  #specifying axis maximums (x and y both) and breaks by datasets
  #update this when adding new datasets/outcomes
  scenario_maximums = rbind(
    #ca outcome 1 (continuous)
    data.frame(dataset = "ca",
               outcome_index = "1", 
               max = .350, 
               breaks = I(list(c(0,.050,.100,.150,.200,.250)))),
    
    #ca outcome 2 (binary)
    data.frame(dataset = "ca", 
               outcome_index = "2", 
               max = .15, 
               breaks = I(list(c(0,.05,.10)))),
  
    #asap outcome 2 (continuous)
    data.frame(dataset = "asap", 
               outcome_index = "2", 
               max = .25, 
               breaks=I(list(c(0,.05,.10,.15,.20)))))


# Function to make RMSE contour plots ----

TYPE_SHAPE_MAP <- c(
  "ATE" = 20, "OLS S" = 18, 
  "INF" = 2, "RF" = 3, "CDML" = 8,
  "LASSO" = 10, "SL" = 16,
  "XGBOOST" = 1, "BART" = 0
)

TYPE_COLOR_MAP <- c(
  "ATE" = "black", 
  "OLS S" = "black", 
  "INF" = "darkgrey", 
  "RF" = "#E69F00", 
  "CDML" = "#F0E442",
  "LASSO" = "#009E73", 
  "SL" = "#D55E00",
  "XGBOOST" = "#CC79A7", 
  "BART" = "#0072B2" # "#56B4E9"
)
LEGEND_COLORS = setdiff(names(TYPE_COLOR_MAP), c("LASSO INF", "RF INF", "ATE", "OLS S"))

# For individual methods
SHAPE_MAP <- c(
  "ATE" = 0, 
  "OLS S" = 1, 
  "RF INF" = 2, 
  "RF T" = 3, 
  "RF MOM IPW" = 4, 
  "RF MOM DR" = 5, 
  "CF" = 6, 
  "CF LC" = 7, 
  "CDML" = 8, 
  "LASSO INF" = 9, 
  "LASSO T" = 10, 
  "LASSO MOM IPW" = 11, 
  "LASSO MOM DR" = 12, 
  "LASSO MCM" = 13, 
  "LASSO MCM EA" = 14, 
  "LASSO R" = 15, 
  "SL T" = 16, 
  "SL S" = 17, 
  "XGBOOST S" = 18, 
  "XGBOOST R" = 19, 
  "BART T" = 20, 
  "BART S" = 21
)
df = combined_all
df$set_id = paste(df$dataset, 
                  df$outcome_index, 
                  df$cov_set_size, 
                  df$train_set_size, sep="-")

ALL_MODELS <- unique(df$model)

no_core <- function(df_set) {
  df_set %>% filter(!baseline)
}
only_core <- function(df_set) {
  df_set %>% filter(baseline)
}


# Mean of all queens that are already aggregated excluding ATE
df_agg = df %>%
                filter(queen != "ATE") %>% 
                group_by(set_id, dataset, outcome_index, 
                         cov_set_size, train_set_size,
                         model, type, modelshort) %>%
                summarise(bias = mean(bias),
                          SE = mean(se),
                          RMSE = mean(rmse),
                          .groups = "drop")

# Update naming and tags for clarity and shorter names ----

# Keep ATE and OLS as baseline, and drop labels for the baseline models
baseline_models = c("ATE", "OLS S", "LASSO INF", "RF INF")
df_agg <- mutate(df_agg,
                  baseline = model %in% baseline_models,
                  modelshort = ifelse(baseline, "", modelshort))

table(df_agg$modelshort)
## 
##            CDML      CF   CF LC     MCM  MCM EA  MOM DR MOM IPW       R       S 
##      72      18      18      18      18      18      36      36      25      32 
##       T 
##      61
df_agg$modelshort <- str_replace(df_agg$modelshort, "MOM ", "")
df_agg$modelshort <- str_replace(df_agg$modelshort, "CF LC", "CF-LC")
df_agg$modelshort <- str_replace(df_agg$modelshort, "MCM EA", "EA")

table(df_agg$modelshort)
## 
##        CDML    CF CF-LC    DR    EA   IPW   MCM     R     S     T 
##    72    18    18    18    36    18    36    18    25    32    61

Figures

Means across queens

CA continuous, small covariate set, train set = 1000

df_agg$se = df_agg$SE
scenario_plot(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "1000", scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 800 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

CA continuous, medium covariate set, train set = 1000

scenario_plot(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "1000", scen_cov_set_size = "medium")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 800 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

CA continuous, large covariate set, train set = 1000

NOTE: this removed CDML

scenario_plot(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "1000", scen_cov_set_size = "large")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 800 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_label_repel()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

CA continuous, small covariate set, train set =2000

scenario_plot(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "2000", scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 800 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

CA continuous, medium covariate set, train set = 2000

scenario_plot(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "2000", scen_cov_set_size = "medium")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 800 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

CA continuous, large covariate set, train set = 2000

scenario_plot(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "2000", scen_cov_set_size = "large")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 800 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

ASAP continuous, small covariate set, train set = 1000

scenario_plot(scen_dataset="asap", scen_outcome_index = "2", scen_train_set_size = "1000", scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 800 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

CA binary, small covariate set, train set = 1000

scenario_plot(scen_dataset="ca", scen_outcome_index = "2", scen_train_set_size = 1000, scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 900 duplicated rows have been dropped.
## Warning: Removed 100 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

Stacked mini plots for CA outcome 1

#set scen_max for ca outcome 1
  scen_max = scenario_maximums%>%filter(dataset == "ca"  & outcome_index == "1")%>% select(max) %>% unlist()

  #use scen_max to create euclidean_distance
  euclidean_distance=make_euclidean_distance(scen_max, scen_max)
  
# Adjusting your plot
# Define shapes for each model type

train_size_shape_map = c("1000"=9, "2000"=2, "5000"=10)
cov_set_size_color_map = c("small"="blue", "medium"="orange", "large" = "#696969")
usedata = df_agg %>%
  filter(dataset =="ca" & outcome_index ==1)

p <- ggplot(usedata, aes(SE, bias, xmax=.250, ymax=.250, breaks=.100)) +
  geom_contour(
    data = euclidean_distance,
    aes(x = x, y = y, z = z),
    breaks = c(.050, .100, .150, .200, .250),
    alpha = 0.6
) +
  geom_point(aes(color =cov_set_size, shape=train_set_size),  size = 2)  +
  facet_wrap(~ model) +  # Add facet_wrap 
  theme_minimal() +
  scale_x_continuous(limits=c(0,.350), breaks=c(0,.100, .200))+
  scale_y_continuous(breaks=c(0,.100, .200))+
  theme(
    plot.background = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA),
    text = element_text(color = "black"),
    plot.title = element_text(size = 12),
    plot.subtitle = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    strip.text = element_text(size =8),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)
) +
  labs(
    title = "Mean of Aggregated Bias and SE across Queens\nby Algorithm with RMSE Contouring",
    subtitle = "* ATE queen is excluded",
    col = "Scenario"
) +
  ylab("Bias") +    # Change X-axis title
  xlab("SE")  +     # Change Y-axis title
  scale_color_manual(values=cov_set_size_color_map)+
  scale_shape_manual(values = train_size_shape_map) +
  guides(color = guide_legend(title = "Covariate Set Size"),
         shape = guide_legend(title = "Train Set Size"))

p
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 17600 duplicated rows have been dropped.
## Warning: Removed 2200 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# ggsave(here::here(paste0("graphs/combined/stackminiplots.png")), p, width = 8, height = 6, units = "in", bg = 'white', limitsize=FALSE)

ATE and CDML queens only

# Adjusting your plot
usedata = combined_all %>%
  filter(dataset =="ca") %>%
  filter(outcome_index ==1) %>%
  filter(queen %in% c("ATE", "CDML"))

usedata$queensize = paste0(usedata$queen, ", ", usedata$train_set_size)
combined_shape_map = c("ATE, 1000"=0, "ATE, 2000"=1, "CDML, 1000" = 15, "CDML, 2000"=16)
cov_set_size_color_map = c("small"="blue", "medium"="orange", "large" = "#696969")


  scen_max = scenario_maximums%>%filter(dataset == "ca" & outcome_index == "1")%>% select(max) %>% unlist()
  
  #use scen_max to create euclidean_distance
  euclidean_distance=make_euclidean_distance(scen_max, scen_max)

  p <- ggplot(usedata, aes(se, bias, xmax=.250, ymax=.250, breaks=.100)) +
  geom_contour(
    data = euclidean_distance,
    aes(x = x, y = y, z = z),
    breaks = c(.050, .100, .150, .200, .250),
    alpha = 0.6
) +
  geom_point(aes(color =cov_set_size, shape=queensize),  size = 2)  +
  facet_wrap(~ model) +  # Add facet_wrap 
  theme_minimal() +
  scale_x_continuous(limits=c(0,.350), breaks=c(0,.100, .200))+
  scale_y_continuous(breaks=c(0,.100, .200))+
  theme(
    plot.background = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA),
    text = element_text(color = "black"),
    plot.title = element_text(size = 12),
    plot.subtitle = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    strip.text = element_text(size =8),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)
) +
  labs(
    title = "Aggregated Bias and SE\nby Algorithm with RMSE Contouring",
    subtitle = "ATE and CDML queens only",
    col = "Scenario"
) +
  ylab("Bias") +    # Change X-axis title
  xlab("SE")  +     # Change Y-axis title
  scale_color_manual(values = cov_set_size_color_map) +
  scale_shape_manual(values = combined_shape_map) +
  guides(color = guide_legend(title = "Covariate Set Size"),
         shape = guide_legend(title = "Train Set Size"))

p
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 17600 duplicated rows have been dropped.
## Warning: Removed 2200 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Warning: Removed 111 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ggsave(here::here(paste0("graphs/combined/ATECDMLonly.png")), p, width = 8, height = 6, units = "in", bg = 'white', limitsize=FALSE)

All queens

#create scenario_allqueens function: plot the results from each queen for a given model and scenario 
scenario_allqueens = function(scen_dataset, scen_outcome_index, scen_train_set_size, scen_cov_set_size){ 

  scen_max = scenario_maximums%>%filter(dataset == scen_dataset & outcome_index == scen_outcome_index)%>% select(max) %>% unlist()
  max_bias = scen_max
  max_se = scen_max
  breaks = unlist(scenario_maximums%>%filter(dataset == scen_dataset & outcome_index == scen_outcome_index)%>% select(breaks))

  #use scen_max to create euclidean_distance
  euclidean_distance=make_euclidean_distance(scen_max, scen_max)
  
  usedata = combined_all[(combined_all$dataset ==scen_dataset & combined_all$outcome_index ==scen_outcome_index  & combined_all$train_set_size==scen_train_set_size & combined_all$cov_set_size == scen_cov_set_size),]
  p <- ggplot(usedata, aes(se, bias, xmax=scen_max, ymax=scen_max)) +
    geom_contour(
      data = euclidean_distance,
      aes(x = x, y = y, z = z),
      breaks = breaks,
      alpha = 0.6
) +
    geom_point(aes(color = queen, shape=queen),  size = 2) +
    scale_x_continuous(limits=c(0,scen_max))+
    scale_y_continuous(limits=c(0,scen_max))+ #adding this to remove some outlier CDML
    facet_wrap(~model) +
    theme_minimal() +
    theme(
      plot.background = element_rect(fill = "white", color = NA),
      panel.background = element_rect(fill = "white", color = NA),
      text = element_text(color = "black"),
      plot.title = element_text(size = 16),
      plot.subtitle = element_text(size = 12),
      axis.title = element_text(size = 14),
      legend.title = element_text(size = 14),
      legend.text = element_text(size = 12),
    strip.text = element_text(size =8),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)
) +
    labs(
      title = "Aggregated Bias and SE by Queen\nfor Each Scenario with RMSE Contouring"
) +
    ylab("Bias") +    # Change X-axis title
    xlab("SE")  +     # Change Y-axis title
    scale_shape_manual(values = SHAPE_MAP) +
    guides(color = guide_legend(title = "Queen"),
           shape = guide_legend(title = "Queen"))
  
  print(p)
    
# ggsave(here::here(paste(sep="_","graphs/combined/allqueens",scen_dataset, scen_outcome_index, scen_train_set_size, scen_cov_set_size, ".png")), p, width = 8, height = 6, units = "in", bg = 'white', limitsize=FALSE)
}

CA continuous, small covariate set, train set = 1000

scenario_allqueens(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "1000", scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 33792 duplicated rows have been dropped.
## Warning: Removed 4026 rows containing non-finite outside the scale range
## (`stat_contour()`).

CA continuous, medium covariate set, train set = 1000

scenario_allqueens(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "1000", scen_cov_set_size = "medium")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 33792 duplicated rows have been dropped.
## Warning: Removed 4026 rows containing non-finite outside the scale range
## (`stat_contour()`).

CA continuous, large covariate set, train set = 1000

scenario_allqueens(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "1000", scen_cov_set_size = "large")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 33792 duplicated rows have been dropped.
## Warning: Removed 4026 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).

CA continuous, small covariate set, train set =2000

scenario_allqueens(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "2000", scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 33792 duplicated rows have been dropped.
## Warning: Removed 4026 rows containing non-finite outside the scale range
## (`stat_contour()`).

CA continuous, medium covariate set, train set = 2000

scenario_allqueens(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "2000", scen_cov_set_size = "medium")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 33792 duplicated rows have been dropped.
## Warning: Removed 4026 rows containing non-finite outside the scale range
## (`stat_contour()`).

CA continuous, large covariate set, train set = 2000

scenario_allqueens(scen_dataset="ca", scen_outcome_index = "1", scen_train_set_size = "2000", scen_cov_set_size = "large")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 33792 duplicated rows have been dropped.
## Warning: Removed 4026 rows containing non-finite outside the scale range
## (`stat_contour()`).

ASAP continuous, small covariate set, train set = 1000

CDML dropped here because bias = ~50

scenario_allqueens(scen_dataset="asap", scen_outcome_index = "2", scen_train_set_size = "1000", scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 30720 duplicated rows have been dropped.
## Warning: Removed 3660 rows containing non-finite outside the scale range
## (`stat_contour()`).
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).

CA binary, small covariate set, train set = 1000

scenario_allqueens(scen_dataset="ca", scen_outcome_index = "2", scen_train_set_size = 1000, scen_cov_set_size = "small")
## Warning: Contour data has duplicated x, y coordinates.
## ℹ 34380 duplicated rows have been dropped.
## Warning: Removed 3620 rows containing non-finite outside the scale range
## (`stat_contour()`).

Figures from Luke

Create contour plot

# Make contour plot ----
df_agg$se = df_agg$SE
df_set_1 <- filter(df_agg, set_id == "ca-1-large-1000")
df_set_2 <- filter(df_agg, set_id == "ca-1-large-2000")

#max_bias = max(df_set_1$bias, df_set_2$bias)
#max_se = max(df_set_1$se, df_set_2$se)
  scen_max = scenario_maximums%>%filter(dataset == "ca" & outcome_index == "1")%>% select(max) %>% unlist()
  max_bias = scen_max
  max_se = scen_max
  breaks = unlist(scenario_maximums%>%filter(dataset == "ca" & outcome_index == "1")%>% select(breaks))


p1 <- make_contour_plot(df_set_1, max_bias=max_bias, max_se=max_se , breaks = breaks)
p1

p2 <- make_contour_plot(df_set_2, max_bias=max_bias, max_se = max_se, breaks = breaks)
p2

# Alternate labeling approaches? ----

p1B <- make_contour_plot(df_set_1, max_bias=max_bias, max_se=max_se, breaks = breaks, add_labels = FALSE)
p1B + ggrepel::geom_text_repel(
  aes(label = modelshort, col = type),
  size = 2, # Increased label size
  max.overlaps = Inf,
  show.legend = FALSE
)

gbs = filter(df_set_1, model %in% c("ATE", "OLS S", "LASSO INF", "RF INF"))
gbs$model
## [1] "ATE"       "LASSO INF" "OLS S"     "RF INF"

p1C (TODO: better title?)

p1C <- ggplot(no_core(df_set_1), aes(se, bias, xmax=max_se, ymax=max_bias)) +
  contour_plot_base(max_bias = max_bias, max_se = max_se , breaks = breaks) +
  geom_point(aes(color = type), size = 4, shape=19) +
  geom_point(data = gbs, aes(pch=model), col="darkgrey", fill="darkgrey", size=4) +
#  geom_point(data = gbs, size = 7, shape = 1, col="red") +
  scale_shape_manual(breaks = c("ATE", "OLS S", "LASSO INF", "RF INF"),
                     values = c(15, 23, 25, 24)) +
  guides(color = guide_legend(title = "Model Type"),
         shape = guide_legend(title = "Reference"))
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.
p1C

p1C + ggrepel::geom_text_repel(
  aes(label = modelshort, col = type),
  size = 3, # Increased label size
  max.overlaps = Inf,
  min.segment.length = Inf,
  show.legend = FALSE
)

#ggsave(here::here("graphs/possible_clean_plot_1.pdf"))

Animation (isn’t working for now)

if (FALSE) {
  # This isn't working
  
# Trying out animation ----

library(ggplot2)
library(gganimate)

df_range = bind_rows(set1 = df_set_1, set2 = df_set_2, .id = "set") %>%
 # dplyr::select(-n, -rmse) %>%
  group_by(model, type, modelshort)

# Plot and animate

p <- ggplot(df_range, aes(se, bias, xmax=max_se, ymax=max_bias)) +
  geom_point(aes(color = type, shape=type),  size = 4)  +
  transition_states(set, transition_length = 2, state_length = 1, wrap=FALSE) +
  ease_aes('linear') +
  labs(title = "Scenario: {closest_state}") +
  contour_plot_base(max_bias = max_bias, max_se = max_se , breaks = breaks) +
  theme(legend.position = "none")

p


anim <- animate(p, duration = 5, fps = 10, width = 600, height = 400)

gganimate::anim_save(here::here("graphs/initial_animation.gif"))



  # Extract the first frame
  first_frame <- anim::anim_save(anim, start_pause = 1, nframes = 1)
  
  # Extract the last frame
  last_frame <- anim_save(anim, end_pause = 1, nframes = 1)
  
  # Add labels to the first frame
  first_frame_labeled <- ggplot_build(first_frame) +
    ggrepel::geom_label_repel(
      aes(label = after_stat(ifelse(set %in% c("set1", "set2"), modelshort, NA)), color = type),
      size = 3, # Increased label size
      box.padding = 0.75,
      point.padding = 0.25,
      max.overlaps = Inf,
      show.legend = FALSE
)   
  
  geom_label_repel(aes(label = modelshort, color = type), size = 5)
  
  # Add labels to the last frame
  last_frame_labeled <- ggplot_build(last_frame) +
    geom_label_repel(aes(label = modelshort, color = type), size = 5)
  
  # You can now save or display these labeled frames
#  ggsave("first_frame_with_labels.png", plot = first_frame_labeled)
#  ggsave("last_frame_with_labels.png", plot = last_frame_labeled)
}

“Trails from last plot” plot (two versions) —-

d_b = df_set_2 %>%
  ungroup() %>%
  dplyr::select(-c(set_id:train_set_size)) %>%
  left_join(dplyr::select(ungroup(df_set_1), model, bias, se), 
             by = "model",
             suffix = c("", "old"))


plt <-  ggplot(d_b, aes(se, bias, xmax=max_se, ymax=max_bias)) +
  geom_segment(aes(x = seold, y = biasold, xend = se, yend = bias),
               arrow = arrow(length = unit(0.2, "cm")), lwd = 1, col="darkgrey") +
  geom_point(aes(color = type, shape=type, size = as.numeric(3+1*baseline)), fill="grey") + 
  contour_plot_base(max_bias = max_bias, max_se = max_se , breaks = breaks) +
  labs(title = "Trails of Models from 1000 to 2000") + 
  scale_size_identity() +
  guides(size="none")
plt

# A second version

pltB <-  ggplot(d_b, aes(se, bias, xmax=max_se, ymax=max_bias)) +
  #geom_point(aes(color = type, shape=type, size = as.numeric(4+2*baseline)), fill="grey") + 
  geom_segment(aes(x = seold, y = biasold, xend = se, yend = bias),
               arrow = arrow(length = unit(0.2, "cm")), lwd = 1, col="lightgrey") +
  geom_point(data = no_core(d_b), aes(color = type), size = 4, shape=19) +
  geom_point(data = only_core(d_b), aes(pch=model), col="darkgrey", fill="darkgrey", size=4) +
  contour_plot_base(max_bias = max_bias, max_se = max_se , breaks = breaks) +
  labs(title = "Trails of Models from 1000 to 2000") + 
  scale_size_identity() +
  scale_shape_manual(breaks = c("ATE", "OLS S", "LASSO INF", "RF INF"),
                     values = c(15, 23, 25, 24)) +
  guides(color = guide_legend(title = "Model Type"),
         shape = guide_legend(title = "Reference"),
         size="none")
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.
pltB

pltB + ggrepel::geom_text_repel(
  aes(label = modelshort, col = type),
  size = 3, # Increased label size
  max.overlaps = Inf,
  min.segment.length = Inf,
  show.legend = FALSE
)

pltB

#ggsave(here::here("graphs/possible_clean_plot_2.pdf"))